# Copyright 2008-2018 pydicom authors. See LICENSE file for details.
"""Unit tests for the pydicom.charset module."""

import pytest

import pydicom.charset
from pydicom import dcmread
from pydicom.data import get_charset_files, get_testdata_file
from pydicom.dataelem import DataElement
from pydicom.filebase import DicomBytesIO
from pydicom.valuerep import PersonName

# File names (without the '.dcm' extension) of most of the character set
# test files, each paired with the expected decoded PatientName value.
# Most of these (except the Korean file) are taken from David Clunie's
# charset example files.
# The (file name, patient name) tuples parametrize
# `TestCharset.test_charset_patient_names`.
FILE_PATIENT_NAMES = [
    ("chrArab", "قباني^لنزار"),
    ("chrFren", "Buc^Jérôme"),
    ("chrFrenMulti", "Buc^Jérôme"),
    ("chrGerm", "Äneas^Rüdiger"),
    ("chrGreek", "Διονυσιος"),
    ("chrH31", "Yamada^Tarou=山田^太郎=やまだ^たろう"),
    ("chrH32", "ﾔﾏﾀﾞ^ﾀﾛｳ=山田^太郎=やまだ^たろう"),
    ("chrHbrw", "שרון^דבורה"),
    ("chrI2", "Hong^Gildong=洪^吉洞=홍^길동"),
    ("chrJapMulti", "やまだ^たろう"),
    ("chrJapMultiExplicitIR6", "やまだ^たろう"),
    ("chrKoreanMulti", "김희중"),
    ("chrRuss", "Люкceмбypг"),
    ("chrX1", "Wang^XiaoDong=王^小東"),
    ("chrX2", "Wang^XiaoDong=王^小东"),
]

# Test data for all single-byte coding extensions.
# Mostly taken from the same example files.
# Each entry is a tuple of (Specific Character Set value, decoded text,
# encoded bytes); every byte string starts with an ESC (0x1b) sequence.
# The tuples parametrize `TestCharset.test_single_byte_code_extensions`.
ENCODED_NAMES = [
    (
        "ISO 2022 IR 13",
        "ﾔﾏﾀﾞ^ﾀﾛｳ",
        b"\x1b\x29\x49\xd4\xcf\xc0\xde\x5e\x1b\x29\x49\xc0\xdb\xb3",
    ),
    (
        "ISO 2022 IR 100",
        "Buc^Jérôme",
        b"\x1b\x2d\x41\x42\x75\x63\x5e\x1b\x2d\x41\x4a\xe9\x72\xf4\x6d\x65",
    ),
    ("ISO 2022 IR 101", "Wałęsa", b"\x1b\x2d\x42\x57\x61\xb3\xea\x73\x61"),
    (
        "ISO 2022 IR 109",
        "antaŭnomo",
        b"\x1b\x2d\x43\x61\x6e\x74\x61\xfd\x6e\x6f\x6d\x6f",
    ),
    ("ISO 2022 IR 110", "vārds", b"\x1b\x2d\x44\x76\xe0\x72\x64\x73"),
    (
        "ISO 2022 IR 127",
        "قباني^لنزار",
        b"\x1b\x2d\x47\xe2\xc8\xc7\xe6\xea\x5e\x1b\x2d\x47\xe4\xe6\xd2\xc7\xd1",
    ),
    (
        "ISO 2022 IR 126",
        "Διονυσιος",
        b"\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2",
    ),
    (
        "ISO 2022 IR 138",
        "שרון^דבורה",
        b"\x1b\x2d\x48\xf9\xf8\xe5\xef\x5e\x1b\x2d\x48\xe3\xe1\xe5\xf8\xe4",
    ),
    (
        "ISO 2022 IR 144",
        "Люкceмбypг",
        b"\x1b\x2d\x4c\xbb\xee\xda\x63\x65\xdc\xd1\x79\x70\xd3",
    ),
    (
        "ISO 2022 IR 148",
        "Çavuşoğlu",
        b"\x1b\x2d\x4d\xc7\x61\x76\x75\xfe\x6f\xf0\x6c\x75",
    ),
    ("ISO 2022 IR 166", "นามสกุล", b"\x1b\x2d\x54\xb9\xd2\xc1\xca\xa1\xd8\xc5"),
]


class TestCharset:
    def test_encodings(self):
        """Every value of `python_encoding` is usable as a codec for `str.encode`."""
        test_string = "Hello World"
        # Only the dict values (the names handed to `str.encode`) are needed,
        # so iterate over them directly instead of over the key/value pairs.
        for py_encoding in pydicom.charset.python_encoding.values():
            # `str.encode` raises LookupError for a codec name Python
            # does not know, which fails the test
            test_string.encode(py_encoding)

    def test_nested_character_sets(self):
        """Sequence items carrying their own encodings are decoded with them."""
        path = get_charset_files("chrSQEncoding.dcm")[0]
        ds = dcmread(path)
        ds.decode()

        # Neither the default encoding nor UTF-8 (the encoding of the parent
        # dataset) can decode the datasets nested in the sequence, so the
        # (0008,0005) value of the nested dataset has to be applied.
        item = ds[0x32, 0x1064][0]
        assert item._character_set == ["shift_jis", "iso2022_jp"]
        assert item.PatientName == "ﾔﾏﾀﾞ^ﾀﾛｳ=山田^太郎=やまだ^たろう"

    def test_inherited_character_set_in_sequence(self):
        """Sequence items are decoded using the encoding of the parent dataset."""
        path = get_charset_files("chrSQEncoding1.dcm")[0]
        ds = dcmread(path)
        ds.decode()

        # The datasets nested in the sequence shall pick up the encoding of
        # the dataset that contains them.
        item = ds[0x32, 0x1064][0]
        assert item._character_set == ["shift_jis", "iso2022_jp"]
        assert item.PatientName == "ﾔﾏﾀﾞ^ﾀﾛｳ=山田^太郎=やまだ^たろう"

    def test_standard_file(self):
        """A standard file without special characters is read and decoded."""
        path = get_testdata_file("CT_small.dcm")
        ds = dcmread(path)
        ds.decode()
        assert ds.PatientName == "CompressedSamples^CT1"

    def test_invalid_character_set(self, allow_reading_invalid_values):
        """An unknown character set is replaced by the default with a warning."""
        path = get_testdata_file("CT_small.dcm")
        ds = dcmread(path)
        ds._read_charset = None
        ds.SpecificCharacterSet = "UNSUPPORTED"
        msg = "Unknown encoding 'UNSUPPORTED' - using default encoding instead"
        with pytest.warns(UserWarning, match=msg):
            ds.decode()
            assert ds.PatientName == "CompressedSamples^CT1"

    def test_invalid_character_set_enforce_valid(self, enforce_valid_values):
        """An unknown character set raises with `enforce_valid_values` active."""
        path = get_testdata_file("CT_small.dcm")
        ds = dcmread(path)
        ds._read_charset = None
        ds.SpecificCharacterSet = "UNSUPPORTED"
        expected_error = "Unknown encoding 'UNSUPPORTED'"
        with pytest.raises(LookupError, match=expected_error):
            ds.decode()

    def test_decoding_with_specific_tags(self):
        """Decoding is correctly applied even if Specific Character Set
        is not in specific tags..."""
        ds = dcmread(
            get_charset_files("chrRuss.dcm")[0], specific_tags=["PatientName"]
        )
        ds.decode()
        # specific character set is always decoded, hence two elements
        assert len(ds) == 2
        assert ds.PatientName == "Люкceмбypг"

    def test_bad_charset(self):
        """An empty list of character sets falls back to the default encoding."""
        # the value of a PN element is a PersonName
        elem = DataElement(0x00100010, "PN", "CITIZEN")

        pydicom.charset.decode_element(elem, ["ISO 2022 IR 126"])
        encodings = elem.value.encodings
        assert "iso_ir_126" in encodings
        assert "iso8859" not in encodings

        # without any character set the default encoding (iso8859) is used
        pydicom.charset.decode_element(elem, [])
        encodings = elem.value.encodings
        assert "iso8859" in encodings

    def test_empty_charset(self):
        """An empty or missing character set yields the default encoding."""
        for charset in ([""], None):
            # a fresh element for each variant, as decoding modifies it
            elem = DataElement(0x00100010, "PN", "CITIZEN")
            pydicom.charset.decode_element(elem, charset)
            assert elem.value.encodings == ("iso8859",)

    def test_bad_encoded_single_encoding(self, allow_reading_invalid_values):
        """Bytes that are invalid for a single encoding warn on decoding."""
        raw_name = b"\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        msg = "Failed to decode byte string with encoding 'UTF8'"

        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, ["ISO_IR 192"])
            # the decoded value consists of replacement characters
            assert elem.value == "���������"

    def test_bad_encoded_single_encoding_enforce_standard(self, enforce_valid_values):
        """Bytes that are invalid for a single encoding raise if
        config.settings.reading_validation_mode is RAISE."""
        raw_name = b"\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        # regular expression: the codec may be reported as 'utf8' or 'utf-8'
        expected_error = (
            "'utf.?8' codec can't decode byte 0xc4 "
            "in position 0: invalid continuation byte"
        )
        with pytest.raises(UnicodeDecodeError, match=expected_error):
            pydicom.charset.decode_element(elem, ["ISO_IR 192"])

    def test_code_extensions_not_allowed(self):
        """Code extensions listed after UTF-8 are ignored with a warning."""
        charsets = ["ISO_IR 192", "ISO 2022 IR 100", "ISO 2022 IR 144"]
        elem = DataElement(0x00100010, "PN", b"Buc^J\xc3\xa9r\xc3\xb4me")
        expected_warning = (
            "Value 'ISO_IR 192' for Specific Character Set does not allow "
            "code extensions, ignoring: ISO 2022 IR 100, ISO 2022 IR 144"
        )

        with pytest.warns(UserWarning, match=expected_warning):
            pydicom.charset.decode_element(elem, charsets)
            assert elem.value == "Buc^Jérôme"

    def test_convert_encodings_warnings(self):
        """Stand-alone encodings given as code extension warn and are dropped."""
        charsets = ["ISO_IR 126", "GBK", "ISO 2022 IR 144", "ISO_IR 192"]
        msg_iso = "Value 'ISO_IR 192' cannot be used as code extension, ignoring it"
        msg_gbk = "Value 'GBK' cannot be used as code extension, ignoring it"

        # both warnings are expected from the single conversion call
        with pytest.warns(UserWarning, match=msg_iso), pytest.warns(
            UserWarning, match=msg_gbk
        ):
            encodings = pydicom.charset.convert_encodings(charsets)
            assert encodings == ["iso_ir_126", "iso_ir_144"]

    def test_convert_python_encodings(self):
        """Python encoding names pass through `convert_encodings` unchanged."""
        py_encodings = ["iso_ir_126", "iso_ir_144"]
        converted = pydicom.charset.convert_encodings(py_encodings)
        assert py_encodings == converted

    def test_convert_empty_encoding(self):
        """Empty or missing encodings are converted to the default encoding."""
        for empty in ("", [""], None):
            assert pydicom.charset.convert_encodings(empty) == ["iso8859"]

    def test_bad_decoded_multi_byte_encoding(self, allow_reading_invalid_values):
        """Bytes that are invalid for a multi-byte encoding warn on decoding."""
        raw_name = b"\x1b$(D\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        msg = "Failed to decode byte string with encodings: iso2022_jp_2"

        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, ["ISO 2022 IR 159"])
            # the decoded value consists of replacement characters
            assert elem.value == "���������"

    def test_bad_decoded_multi_byte_encoding_enforce_standard(
        self, enforce_valid_values
    ):
        """Bytes that are invalid for a multi-byte encoding raise if
        `config.settings.reading_validation_mode` is RAISE"""
        raw_name = b"\x1b$(D\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        expected_error = (
            "'iso2022_jp_2' codec can't decode byte 0xc4 "
            "in position 4: illegal multibyte sequence"
        )
        with pytest.raises(UnicodeDecodeError, match=expected_error):
            pydicom.charset.decode_element(elem, ["ISO 2022 IR 159"])

    def test_unknown_escape_sequence(self, allow_reading_invalid_values):
        """An unknown escape sequence in the value warns on decoding."""
        raw_name = b"\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        msg = "Found unknown escape sequence in encoded string value"

        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, ["ISO_IR 100"])
            # the escape sequence is kept in the decoded value
            assert elem.value == "\x1b-FÄéïíõóéïò"

    def test_unknown_escape_sequence_enforce_standard(self, enforce_valid_values):
        """An unknown escape sequence in the value raises if
        `config.settings.reading_validation_mode` is RAISE"""
        raw_name = b"\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        elem = DataElement(0x00100010, "PN", raw_name)
        expected_error = "Found unknown escape sequence in encoded string value"
        with pytest.raises(ValueError, match=expected_error):
            pydicom.charset.decode_element(elem, ["ISO_IR 100"])

    def test_patched_charset(self, allow_reading_invalid_values):
        """Commonly misspelled character set values are patched with a warning."""
        tag = 0x00100010
        utf8_name = b"Buc^J\xc3\xa9r\xc3\xb4me"

        # correctly spelled value
        elem = DataElement(tag, "PN", utf8_name)
        pydicom.charset.decode_element(elem, ["ISO_IR 192"])
        assert elem.value == "Buc^Jérôme"

        # a patched value shall give the same result, but issues a warning
        elem = DataElement(tag, "PN", utf8_name)
        msg = (
            "Incorrect value for Specific Character Set "
            "'ISO IR 192' - assuming 'ISO_IR 192'"
        )
        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, ["ISO IR 192"])
            assert elem.value == "Buc^Jérôme"

        elem = DataElement(tag, "PN", b"Buc^J\xe9r\xf4me")
        msg = (
            "Incorrect value for Specific Character Set "
            "'ISO-IR 144' - assuming 'ISO_IR 144'"
        )
        with pytest.warns(UserWarning, match=msg) as record:
            pydicom.charset.decode_element(elem, ["ISO_IR 100", "ISO-IR 144"])
            # the correctly spelled first value must not add a warning
            assert len(record) == 1

        # a misspelling that is not patched is replaced by the default encoding
        elem = DataElement(tag, "PN", utf8_name)
        msg = "Unknown encoding 'ISOIR 192' - using default encoding instead"
        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, ["ISOIR 192"])

        # a Python encoding name can be given directly as well
        elem = DataElement(tag, "PN", utf8_name)
        pydicom.charset.decode_element(elem, ["utf8"])
        assert elem.value == "Buc^Jérôme"

    def test_patched_code_extension_charset(self):
        """Misspelled code extension values are patched with a warning."""
        tag = 0x00100010
        raw_name = b"Dionysios=\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        decoded_name = "Dionysios=Διονυσιος"

        # correctly spelled values
        elem = DataElement(tag, "PN", raw_name)
        pydicom.charset.decode_element(elem, ["ISO 2022 IR 100", "ISO 2022 IR 126"])
        assert elem.value == decoded_name

        # patched values shall give the same result, but issue a warning
        msg = (
            "Incorrect value for Specific Character Set "
            "'ISO_2022-IR 100' - assuming 'ISO 2022 IR 100'"
        )
        with pytest.warns(UserWarning, match=msg):
            elem = DataElement(tag, "PN", raw_name)
            pydicom.charset.decode_element(elem, ["ISO_2022-IR 100", "ISO 2022 IR 126"])

            assert elem.value == decoded_name

        # the '+' has to be escaped, as `match` is a regular expression
        msg = (
            r"Incorrect value for Specific Character Set "
            r"'ISO_2022_IR\+126' - assuming 'ISO 2022 IR 126'"
        )
        with pytest.warns(UserWarning, match=msg):
            elem = DataElement(tag, "PN", raw_name)
            pydicom.charset.decode_element(elem, ["ISO 2022 IR 100", "ISO_2022_IR+126"])
            assert elem.value == decoded_name

    def test_multi_charset_default_value(self):
        """Without an escape code the first character set is used."""
        # regression test for #707
        charsets = ["ISO 2022 IR 100", "ISO 2022 IR 144"]

        name = DataElement(0x00100010, "PN", b"Buc^J\xe9r\xf4me")
        pydicom.charset.decode_element(name, charsets)
        assert name.value == "Buc^Jérôme"

        description = DataElement(0x00081039, "LO", b"R\xf6ntgenaufnahme")
        pydicom.charset.decode_element(description, charsets)
        assert description.value == "Röntgenaufnahme"

    def test_single_byte_multi_charset_personname(self):
        """PN values mixing single-byte encodings are decoded per part."""
        greek = b"\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        cyrillic = b"\x1b\x2d\x4c\xbb\xee\xda\x63\x65\xdc\xd1\x79\x70\xd3"

        # component groups with different encodings
        elem = DataElement(0x00100010, "PN", b"Dionysios=" + greek)
        pydicom.charset.decode_element(elem, ["ISO 2022 IR 100", "ISO 2022 IR 126"])
        assert elem.value == "Dionysios=Διονυσιος"

        # multiple values with different encodings, separated by backslashes
        multi_value = b"\\".join((b"Buc^J\xe9r\xf4me", greek, cyrillic))
        elem = DataElement(0x00100060, "PN", multi_value)
        charsets = ["ISO 2022 IR 100", "ISO 2022 IR 144", "ISO 2022 IR 126"]
        pydicom.charset.decode_element(elem, charsets)
        assert elem.value == ["Buc^Jérôme", "Διονυσιος", "Люкceмбypг"]

    def test_single_byte_multi_charset_text(self):
        """LO values mixing single-byte encodings are decoded per part."""
        greek = b"\x1b\x2d\x46\xc4\xe9\xef\xed\xf5\xf3\xe9\xef\xf2"
        cyrillic = b"\x1b\x2d\x4c\xbb\xee\xda\x63\x65\xdc\xd1\x79\x70\xd3"

        # the encoding changes inside of the string
        elem = DataElement(0x00081039, "LO", b"Dionysios is " + greek)
        pydicom.charset.decode_element(elem, ["ISO 2022 IR 100", "ISO 2022 IR 126"])
        assert elem.value == "Dionysios is Διονυσιος"

        # multiple values with different encodings, separated by backslashes
        multi_value = b"\\".join((b"Buc^J\xe9r\xf4me", greek, cyrillic))
        elem = DataElement(0x00081039, "LO", multi_value)
        charsets = ["ISO 2022 IR 100", "ISO 2022 IR 144", "ISO 2022 IR 126"]
        pydicom.charset.decode_element(elem, charsets)
        assert elem.value == ["Buc^Jérôme", "Διονυσιος", "Люкceмбypг"]

    @pytest.mark.parametrize("encoding, decoded, raw_data", ENCODED_NAMES)
    def test_single_byte_code_extensions(self, encoding, decoded, raw_data):
        """A single-byte encoding is decoded when used as code extension."""
        prefix = "ASCII+"
        # ASCII text first, followed by the text in the code extension
        raw_value = prefix.encode("ascii") + raw_data
        elem = DataElement(0x00100010, "PN", raw_value)
        pydicom.charset.decode_element(elem, ["", encoding])
        assert elem.value == prefix + decoded

    def test_missing_escape_for_single_byte_code_extensions(self):
        """A component without escape sequence is not decoded as code extension."""
        # the escape sequence is missing after the component delimiter (0x5e)
        escaped_part = b"\x1b\x29\x49\xd4\xcf\xc0\xde\x5e"
        unescaped_part = b"\xc0\xdb\xb3"
        elem = DataElement(
            0x00100010, "PN", b"ASCII+" + escaped_part + unescaped_part
        )
        pydicom.charset.decode_element(elem, ["", "ISO 2022 IR 13"])
        # the last part is decoded as Latin1
        assert elem.value == "ASCII+ﾔﾏﾀﾞ^ÀÛ³"

    @pytest.mark.parametrize("filename, patient_name", FILE_PATIENT_NAMES)
    def test_charset_patient_names(self, filename, patient_name):
        """Patient names survive reading, decoding and writing back."""
        # reading: the decoded name matches the expected value
        ds = dcmread(get_charset_files(filename + ".dcm")[0])
        ds.decode()
        assert ds.PatientName == patient_name

        # writing: the name is kept after a round trip through a buffer
        buffer = DicomBytesIO()
        ds.save_as(buffer, enforce_file_format=True, implicit_vr=False)
        buffer.seek(0)
        ds = dcmread(buffer)
        assert ds.PatientName == patient_name

        # writing once more without the original byte string
        # (PersonName only)
        if not hasattr(ds.PatientName, "original_string"):
            return
        ds.PatientName.original_string = None
        buffer = DicomBytesIO()
        ds.save_as(buffer, enforce_file_format=True)
        buffer.seek(0)
        ds = dcmread(buffer)
        assert ds.PatientName == patient_name

    def test_changed_character_set(self):
        """Changing Specific Character Set re-encodes the values on writing."""
        # Regression test for #629
        multiPN_name = get_charset_files("chrFrenMulti.dcm")[0]
        ds = dcmread(multiPN_name)  # is Latin-1
        ds.SpecificCharacterSet = "ISO_IR 192"

        # `DicomBytesIO` is imported at module level; no local import needed
        fp = DicomBytesIO()
        ds.save_as(fp, enforce_file_format=True)
        fp.seek(0)
        ds_out = dcmread(fp)
        # we expect UTF-8 encoding here; `get_item` is used to compare
        # against the encoded bytes of the element
        assert b"Buc^J\xc3\xa9r\xc3\xb4me" == ds_out.get_item(0x00100010).value

    def test_invalid_second_encoding(self, allow_reading_invalid_values):
        """An unknown second character set warns and falls back to the default."""
        # regression test for #850
        charsets = ["ISO 2022 IR 100", "ISO 2022 IR 146"]
        elem = DataElement(0x00100010, "PN", "CITIZEN")
        msg = "Unknown encoding 'ISO 2022 IR 146' - using default encoding instead"
        with pytest.warns(UserWarning, match=msg):
            pydicom.charset.decode_element(elem, charsets)

    def test_invalid_second_encoding_strict(self, enforce_valid_values):
        """An unknown second character set raises with `enforce_valid_values`."""
        charsets = ["ISO 2022 IR 100", "ISO 2022 IR 146"]
        elem = DataElement(0x00100010, "PN", "CITIZEN")
        expected_error = "Unknown encoding 'ISO 2022 IR 146'"
        with pytest.raises(LookupError, match=expected_error):
            pydicom.charset.decode_element(elem, charsets)

    def test_japanese_multi_byte_personname(self):
        """Japanese person names with multi-byte strings are correctly encoded."""
        ds = dcmread(get_charset_files("chrH32.dcm")[0])
        ds.decode()

        if hasattr(ds.PatientName, "original_string"):
            # remember the original byte string and reset it before writing;
            # the file read back shall contain the same bytes again
            expected_bytes = ds.PatientName.original_string
            ds.PatientName.original_string = None
            buffer = DicomBytesIO()
            ds.save_as(buffer, enforce_file_format=True, implicit_vr=False)
            buffer.seek(0)
            reread = dcmread(buffer)
            assert reread.PatientName.original_string == expected_bytes

        py_encodings = pydicom.charset.convert_encodings(
            ["ISO 2022 IR 6", "ISO 2022 IR 87", "ISO 2022 IR 159"]
        )
        name = PersonName("Mori^Ogai=森^鷗外=もり^おうがい")
        encoded_name = bytes(name.encode(py_encodings))
        assert encoded_name == (
            b"\x4d\x6f\x72\x69\x5e\x4f\x67\x61\x69\x3d\x1b\x24\x42\x3f"
            b"\x39\x1b\x28\x42\x5e\x1b\x24\x28\x44\x6c\x3f\x1b\x24\x42"
            b"\x33\x30\x1b\x28\x42\x3d\x1b\x24\x42\x24\x62\x24\x6a\x1b"
            b"\x28\x42\x5e\x1b\x24\x42\x24\x2a\x24\x26\x24\x2c\x24\x24"
            b"\x1b\x28\x42"
        )

    def test_japanese_multi_byte_encoding(self):
        """Japanese multi-byte strings are correctly encoded."""
        py_encodings = ["shift_jis", "iso2022_jp", "iso2022_jp_2"]
        encoded = pydicom.charset.encode_string("あaｱア齩", py_encodings)
        assert bytes(encoded) == (
            b'\x1b$B$"\x1b(Ja\x1b)I\xb1\x1b$B%"\x1b$(DmN\x1b(J'
        )

    def test_bad_japanese_encoding(self):
        """Characters that cannot be encoded are replaced with a warning."""
        msg = (
            "Failed to encode value with encodings: shift_jis - "
            "using replacement characters in encoded string"
        )
        with pytest.warns(UserWarning, match=msg):
            encoded = pydicom.charset.encode_string("あaｱア", ["shift_jis"])
            assert encoded == b"?a??"
